Pbmc media2

ScRNA seq data from Calini representing PBMCs from 2 individuals (male age 60,52) in different storing conditions: - fresh - fixed with MetOH - stored in 15% DMSO

Libraries

suppressPackageStartupMessages({
    library(pheatmap)
    library(scater)
    library(dplyr)
    library(reshape2)
    library(ggplot2)
    library(cowplot)
    library(mvoutlier)
    library(Matrix)
    library(purrr)
    library(gplots)
    library(scran)
    library(Seurat)
    library(mclust)
    library(readxl)
    library(DropletUtils)
    library(magrittr)
    library(LSD)
    library(CellMixS)
})

Parameter

datadir <- "/home/Shared_s3it/_home_Shared_taupo_data_seq/calini_scrnaseq/FGCZ_human"
data_path <- "/home/aluetg/scRNA/pbmc_media2/data"
meta_dir <- "20190312 NovaSeqRun Samples Characteristics.xlsx"
  
seed <- 1234

Data

Load data and generate a SingleCellExperiment object from them

samples <- list.files(datadir, pattern="CR05", full.names = TRUE)
samples <- samples %>% extract(.!=paste0(datadir, "/CR059"))
samples <- samples %>% extract(.!=paste0(datadir, "/CR052"))
names(samples) <- basename(samples)
samples <- paste0(samples, "/outs/filtered_feature_bc_matrix")
samples <- samples[file.exists(paste0(samples, "/matrix.mtx.gz"))]

sce <- DropletUtils::read10xCounts(samples=samples)
sce$dataset <- basename(gsub("/outs/filtered_feature_bc_matrix","",as.character(sce$Sample)))

meta <- read_excel(paste0(datadir, "/", meta_dir))
sce$Sample <- meta[["Sample Name"]][match(sce$dataset, meta$`Sequencing ID`)]
sce$group <- meta[["Sample Characteristics"]][match(sce$dataset, meta$`Sequencing ID`)]
sce$Sample <- factor(paste(sce$Sample, sce$group))
sce$dataset <- factor(sce$dataset)
colnames(sce) <- paste0(sce$dataset, ".", sce$Barcode)
rownames(sce) <- paste0(rowData(sce)$ID, ".", rowData(sce)$Symbol)

##Add patient id
sce$patient <- ifelse(grepl("D301089", sce$Sample), "pat1", "pat2")
sce$media <- ifelse(grepl("Fresh", sce$Sample), "fresh", ifelse(grepl("DMSO", sce$Sample), "DMSO", "MetOH"))

## Filter out genes that are not expressed in any cell
sce <- sce[which(rowSums(counts(sce) > 0) > 0), ]
dim(sce)
## [1] 23886 40717

QC and filtering

## Mitochondrial genes
## Find mitochondrial genes
is.mito <- grepl("\\MT-", rownames(sce))
summary(is.mito)
##    Mode   FALSE    TRUE 
## logical   23873      13
rownames(sce)[is.mito]
##  [1] "ENSG00000198888.MT-ND1"  "ENSG00000198763.MT-ND2" 
##  [3] "ENSG00000198804.MT-CO1"  "ENSG00000198712.MT-CO2" 
##  [5] "ENSG00000228253.MT-ATP8" "ENSG00000198899.MT-ATP6"
##  [7] "ENSG00000198938.MT-CO3"  "ENSG00000198840.MT-ND3" 
##  [9] "ENSG00000212907.MT-ND4L" "ENSG00000198886.MT-ND4" 
## [11] "ENSG00000198786.MT-ND5"  "ENSG00000198695.MT-ND6" 
## [13] "ENSG00000198727.MT-CYB"
sce <- calculateQCMetrics(sce, feature_controls = list(Mt = is.mito))
plotHighestExprs(sce,n=30)

Filtering

  1. Find outlier
#Plot functions
outlierPlot <- function(cd, feature, aph=NULL, logScale=FALSE, show.legend=TRUE){
  if(is.null(aph)) aph <- paste0(feature, "_drop")
  if(!(aph %in% colnames(cd))) aph <- NULL
  p <-  ggplot(as.data.frame(cd), aes_string(x = feature, alpha = aph)) + 
      geom_histogram(show.legend=show.legend)
  if(!is.null(aph)) p <- p + scale_alpha_manual(values = c("TRUE" = 0.4, "FALSE" = 1))
  if(logScale) p <- p + scale_x_log10()
  p
}

plQCplot <- function(cd, show.legend=TRUE){
  ps <- lapply(split(cd,cd$dataset), sl=show.legend, FUN=function(x,sl){
    list( outlierPlot( x, "total_counts", logScale=T, show.legend=sl),
          outlierPlot( x, "total_features_by_counts", "total_features_drop", 
                       logScale=T, show.legend=sl),
          outlierPlot( x, "pct_counts_Mt", "mito_drop", show.legend=sl)
          )
  })
  plot_grid( plotlist = do.call(c, ps), 
             labels=rep(basename(names(ps)), each=length(ps[[1]])), 
             ncol=length(ps[[1]]), 
             label_x=0.5 )
}

#Filtering
sce$total_counts_drop <- isOutlier(sce$total_counts, nmads = 2.5, 
                                   type = "both", log = TRUE, batch=sce$dataset)

sce$total_features_drop <- isOutlier(sce$total_features_by_counts, nmads = 2.5, 
                                   type = "both", log = TRUE, batch=sce$dataset)

sce$mito_drop <- sce$pct_counts_Mt > 5 &
                 isOutlier(sce$pct_counts_Mt, nmads = 2.5, type = "higher", batch=sce$dataset)

sce$isOutlier <- sce$total_counts_drop | sce$total_features_drop | sce$mito_drop

#Plot
#pdf(file = "/home/aluetg/scRNA/pbmc_media2/scripts/pbmc_media2_pre_qc_files/figure-html/qc_plots.pdf", width = 15, height = 50, paper = "letter")
p <- plQCplot(colData(sce), show.legend=FALSE)
p

save_plot("/home/aluetg/scRNA/pbmc_media2/data/res/qc_filtering.png", p, ncol = 3, nrow = length(levels(sce$dataset)))
#dev.off()

Total counts vs mito genes

plotColData(sce, x = "total_features_by_counts", y = "pct_counts_Mt",colour = "dataset")

plotColData(sce, x = "total_features_by_counts", y = "pct_counts_Mt",colour = "media")

plotColData(sce, x = "total_counts", y = "pct_counts_Mt",colour = "media")

plotColData(sce, x = "total_counts", y = "pct_counts_Mt",colour = "dataset")

  1. Check outlier
mets <- c("total_counts_drop","total_features_drop","mito_drop")
sapply(mets, FUN=function(x){ sapply(mets, y=x, function(x,y){ sum(sce[[x]] & sce[[y]]) }) })
##                     total_counts_drop total_features_drop mito_drop
## total_counts_drop                4809                4100       904
## total_features_drop              4100                6137       813
## mito_drop                         904                 813      2705
nbcells <- cbind(table(sce$Sample),table(sce$Sample[!sce$isOutlier]))
colnames(nbcells) <- c("cells total","cells after filtering")
nbcells
##                                                                   cells total
## Fresh sample D301089 M age 52 (1967) Purified PBMC                       8197
## Fresh sample D301093 M age 60 (1959) Purified PBMC                       4778
## Sample D301089 M age 52 (1967) fixed with MetOH Purified PBMC            8807
## Sample D301089 M age 52 (1967) stored with 15% DMSO Purified PBMC        7700
## Sample D301093 M age 60 (1959) fixed with MetOH Purified PBMC            3277
## Sample D301093 M age 60 (1959) stored with 15% DMSO Purified PBMC        7958
##                                                                   cells after filtering
## Fresh sample D301089 M age 52 (1967) Purified PBMC                                 6169
## Fresh sample D301093 M age 60 (1959) Purified PBMC                                 3265
## Sample D301089 M age 52 (1967) fixed with MetOH Purified PBMC                      8170
## Sample D301089 M age 52 (1967) stored with 15% DMSO Purified PBMC                  6264
## Sample D301093 M age 60 (1959) fixed with MetOH Purified PBMC                      2406
## Sample D301093 M age 60 (1959) stored with 15% DMSO Purified PBMC                  5865
layout(matrix(1:2,nrow=1))
LSD::heatscatter( sce$total_counts, sce$total_features_by_counts, xlab="Total counts", ylab="Non-zero features", main="",log="xy")
w <- which(!sce$isOutlier)
LSD::heatscatter( sce$total_counts[w], sce$total_features_by_counts[w], xlab="Total counts", ylab="Non-zero features", main="Filtered cells",log="xy")

  1. Filter outlier
sce <- sce[,!sce$isOutlier]
sce <- sce[which(rowSums(counts(sce)>1)>=20),]
dim(sce)
## [1]  8331 32139
table(sce$dataset)
## 
## CR053 CR054 CR055 CR056 CR057 CR058 
##  6169  3265  6264  5865  8170  2406

Scater

Normalize data

#Normalize all datasets together 
sce <- computeSumFactors(sce)
## Warning in FUN(...): encountered negative size factor estimates

## Warning in FUN(...): encountered negative size factor estimates

## Warning in FUN(...): encountered negative size factor estimates

## Warning in FUN(...): encountered negative size factor estimates
sce <- normalize(sce)

Highly variable genes

fit_sce <- trendVar(sce, use.spikes=FALSE) 
dec_sce <- decomposeVar(sce, fit_sce)
dec_sce$Symbol <- rowData(sce)$Symbol
dec_sce <- dec_sce[order(dec_sce$bio, decreasing = TRUE), ]
hvg <- rownames(dec_sce)[dec_sce$bio > 0]
metadata(sce)$hvg_genes <- hvg

Reduced dimsnsion

sce <- runPCA(sce, ncomponents = 20, feature_set = hvg)
sce <- runUMAP(sce, feature_set = hvg)
sce <- runTSNE(sce, feature_set = hvg)

visGroup(sce, "dataset")

visGroup(sce, "media")

visGroup(sce, "patient")

visGroup(sce, "dataset", dim_red = "UMAP")

visGroup(sce, "patient", dim_red = "UMAP")

visGroup(sce, "media", dim_red = "UMAP")

visGroup(sce, "dataset", dim_red = "PCA")

Seurat

seurat <- CreateSeuratObject(counts = counts(sce), meta.data = as.data.frame(colData(sce)), min.cells = 0, min.features = 0, project = "10X", names.delim = ".")
seurat <- NormalizeData(object = seurat)

hvg Seurat

seurat <- FindVariableFeatures(object = seurat)
seurat <- ScaleData(object = seurat, verbose = FALSE)

hvg_seurat <- seurat@assays$RNA@var.features
hvg_overlap <- intersect(hvg_seurat, hvg)
length(hvg_overlap)
## [1] 810
length(hvg_seurat)
## [1] 2000
length(hvg)
## [1] 3636
head(hvg_overlap)
## [1] "ENSG00000244734.HBB"    "ENSG00000188536.HBA2"  
## [3] "ENSG00000206172.HBA1"   "ENSG00000163220.S100A9"
## [5] "ENSG00000118785.SPP1"   "ENSG00000275302.CCL4"

Reduced dimension seurat

seurat <- RunPCA(object = seurat, npcs = 20, verbose = FALSE)
seurat <- FindNeighbors(object = seurat, reduction = "pca", dims = 1:20)
## Computing nearest neighbor graph
## Computing SNN
seurat <- FindClusters(object = seurat, resolution = 0.2, random.seed = seed)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 32139
## Number of edges: 1165442
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9617
## Number of communities: 16
## Elapsed time: 10 seconds
seurat <- RunTSNE(object = seurat, perplexity = 30, reduction = "pca", dims = 1:20)
seurat <- RunUMAP(object = seurat, reduction = "pca", dims = 1:10, n.neighbors = 30, min.dist = 0.5)

plot_grid(
  DimPlot(seurat, reduction = "tsne", group.by="dataset", label = T, label.size = 4, do.return = T),
  DimPlot(seurat, reduction = "tsne", group.by="RNA_snn_res.0.2", do.return = T),
  DimPlot(seurat, reduction = "umap", group.by="dataset", do.return = T),
  DimPlot(seurat, reduction = "umap", group.by="RNA_snn_res.0.2", label = T, label.size = 4, do.return = T)
)

Cluster marker

marker_genes <- FindAllMarkers( object = seurat, only.pos = FALSE, 
                             min.pct = 0.25, 
                             resolution = 0.2,
                             logfc.threshold = 0.5, test.use = "wilcox",
                             max.cells.per.ident = 300 )
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
## Calculating cluster 11
## Calculating cluster 12
## Calculating cluster 13
## Calculating cluster 14
## Calculating cluster 15
saveRDS(marker_genes, paste0(data_path, "pbmc_media2_marker.rds"))

plot marker genes

getTopMarkers <- function(x){
  x <- x[which(x$p_val_adj<0.05),]
  x <- x[order(x$p_val),]
  x <- lapply(split(x, x$cluster, drop=F),FUN=function(x){ head(x$gene) })
  unique(unlist(x))
}
hmWrapper <- function(seurat, genes, nbPerSample=10, slot="data", useHighCoverage=T){
    seurat$cluster <- Idents(seurat)
    m <- seurat@meta.data[,c("Sample","total_counts","cluster")]
    if(useHighCoverage) m <- m[order(m$total_counts, decreasing=T),]
    cells <- lapply( unique(seurat$Sample),
                     m=m,
                     FUN=function(x,m){
        m <- m[which(m$Sample==x),]
        
        m <- split(row.names(m), m$cluster)
        unlist(lapply(m, n=nbPerSample, head))
    })
    DoHeatmap( object=seurat, features=genes, cells=unlist(cells), slot=slot )
}
hmWrapper(seurat, getTopMarkers(marker_genes))

###Convert seurat to sce

sce2 <- SingleCellExperiment(
          assays=list(
            counts=seurat@assays$RNA@counts,
            logcounts=seurat@assays$RNA@data
          ),
          colData=seurat@meta.data,
          reducedDims=lapply(seurat@reductions, FUN=function(x) x@cell.embeddings)
    )

saveRDS(sce2, file = paste0(data_path, "/objects/sce_from_seurat_pbmc_media2.rds"))

#assign cluster to sce
identical(colnames(sce), colnames(seurat))
## [1] TRUE
sce$seurat_cluster <- seurat@meta.data$seurat_clusters

Save data

saveRDS(sce, file = paste0(data_path, "/objects/sce_pbmc_media2.rds"))
saveRDS(seurat, file = paste0(data_path, "/objects/seurat_pbmc_media2.rds"))
sessionInfo()
## R version 3.6.0 alpha (2019-04-08 r76348)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 16.04.5 LTS
## 
## Matrix products: default
## BLAS:   /home/aluetg/R/lib/R/lib/libRblas.so
## LAPACK: /home/aluetg/R/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_CA.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_CA.UTF-8        LC_COLLATE=en_CA.UTF-8    
##  [5] LC_MONETARY=en_CA.UTF-8    LC_MESSAGES=en_CA.UTF-8   
##  [7] LC_PAPER=en_CA.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_CA.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] CellMixS_0.99.5             kSamples_1.2-8             
##  [3] SuppDists_1.1-9.4           LSD_4.0-0                  
##  [5] magrittr_1.5                DropletUtils_1.3.14        
##  [7] readxl_1.3.1                mclust_5.4.3               
##  [9] Seurat_3.0.0.9100           scran_1.11.26              
## [11] gplots_3.0.1.1              purrr_0.3.2                
## [13] Matrix_1.2-17               mvoutlier_2.0.9            
## [15] sgeostat_1.0-27             cowplot_0.9.4              
## [17] reshape2_1.4.3              dplyr_0.8.0.1              
## [19] scater_1.11.16              ggplot2_3.1.1              
## [21] SingleCellExperiment_1.5.2  SummarizedExperiment_1.13.0
## [23] DelayedArray_0.9.9          BiocParallel_1.17.18       
## [25] matrixStats_0.54.0          Biobase_2.43.1             
## [27] GenomicRanges_1.35.1        GenomeInfoDb_1.19.3        
## [29] IRanges_2.17.4              S4Vectors_0.21.23          
## [31] BiocGenerics_0.29.2         pheatmap_1.0.12            
## 
## loaded via a namespace (and not attached):
##   [1] reticulate_1.11.1        R.utils_2.8.0           
##   [3] tidyselect_0.2.5         htmlwidgets_1.3         
##   [5] grid_3.6.0               trimcluster_0.1-2.1     
##   [7] ranger_0.11.2            Rtsne_0.15              
##   [9] munsell_0.5.0            codetools_0.2-16        
##  [11] ica_1.0-2                statmod_1.4.30          
##  [13] future_1.12.0            sROC_0.1-2              
##  [15] withr_2.1.2              colorspace_1.4-1        
##  [17] knitr_1.22               ROCR_1.0-7              
##  [19] robustbase_0.93-4        vcd_1.4-4               
##  [21] VIM_4.8.0                gbRd_0.4-11             
##  [23] listenv_0.7.0            Rdpack_0.10-1           
##  [25] labeling_0.3             GenomeInfoDbData_1.2.1  
##  [27] cvTools_0.3.2            rhdf5_2.27.15           
##  [29] xfun_0.6                 diptest_0.75-7          
##  [31] R6_2.4.0                 ggbeeswarm_0.6.0        
##  [33] robCompositions_2.0.10   rsvd_1.0.0              
##  [35] locfit_1.5-9.1           flexmix_2.3-15          
##  [37] bitops_1.0-6             reshape_0.8.8           
##  [39] assertthat_0.2.1         SDMTools_1.1-221        
##  [41] scales_1.0.0             nnet_7.3-12             
##  [43] beeswarm_0.2.3           gtable_0.3.0            
##  [45] npsurv_0.4-0             globals_0.12.4          
##  [47] rlang_0.3.4              splines_3.6.0           
##  [49] lazyeval_0.2.2           yaml_2.2.0              
##  [51] abind_1.4-5              tools_3.6.0             
##  [53] zCompositions_1.2.0      RColorBrewer_1.1-2      
##  [55] dynamicTreeCut_1.63-1    ggridges_0.5.1          
##  [57] Rcpp_1.0.1               plyr_1.8.4              
##  [59] zlibbioc_1.29.0          RCurl_1.95-4.12         
##  [61] pbapply_1.4-0            viridis_0.5.1           
##  [63] zoo_1.8-5                haven_2.1.0             
##  [65] ggrepel_0.8.0            cluster_2.0.8           
##  [67] RSpectra_0.14-0          data.table_1.12.2       
##  [69] openxlsx_4.1.0           lmtest_0.9-36           
##  [71] RANN_2.6.1               truncnorm_1.0-8         
##  [73] mvtnorm_1.0-10           fitdistrplus_1.0-14     
##  [75] hms_0.4.2                lsei_1.2-0              
##  [77] evaluate_0.13            rio_0.5.16              
##  [79] gridExtra_2.3            compiler_3.6.0          
##  [81] tibble_2.1.1             KernSmooth_2.23-15      
##  [83] crayon_1.3.4             R.oo_1.22.0             
##  [85] htmltools_0.3.6          pcaPP_1.9-73            
##  [87] tidyr_0.8.3              rrcov_1.4-7             
##  [89] RcppParallel_4.4.2       MASS_7.3-51.4           
##  [91] fpc_2.1-11.1             boot_1.3-22             
##  [93] car_3.0-2                R.methodsS3_1.7.1       
##  [95] gdata_2.18.0             metap_1.1               
##  [97] igraph_1.2.4             forcats_0.4.0           
##  [99] pkgconfig_2.0.2          foreign_0.8-71          
## [101] laeken_0.5.0             sp_1.3-1                
## [103] plotly_4.9.0             vipor_0.4.5             
## [105] dqrng_0.1.1              XVector_0.23.2          
## [107] bibtex_0.4.2             NADA_1.6-1              
## [109] stringr_1.4.0            digest_0.6.18           
## [111] RcppAnnoy_0.0.11         pls_2.7-1               
## [113] tsne_0.1-3               rmarkdown_1.12          
## [115] cellranger_1.1.0         uwot_0.1.3              
## [117] edgeR_3.25.3             DelayedMatrixStats_1.5.2
## [119] listarrays_0.2.0         curl_3.3                
## [121] kernlab_0.9-27           gtools_3.8.1            
## [123] modeltools_0.2-22        nlme_3.1-139            
## [125] jsonlite_1.6             Rhdf5lib_1.5.4          
## [127] carData_3.0-2            BiocNeighbors_1.1.13    
## [129] viridisLite_0.3.0        limma_3.39.14           
## [131] pillar_1.3.1             lattice_0.20-38         
## [133] GGally_1.4.0             httr_1.4.0              
## [135] DEoptimR_1.0-8           survival_2.44-1.1       
## [137] glue_1.3.1               zip_2.0.1               
## [139] png_0.1-7                prabclus_2.2-7          
## [141] class_7.3-15             stringi_1.4.3           
## [143] HDF5Array_1.11.11        BiocSingular_0.99.15    
## [145] caTools_1.17.1.2         irlba_2.3.3             
## [147] e1071_1.7-1              future.apply_1.2.0      
## [149] ape_5.3